/*==============================================================================
FILE NAME: Process_Incidents.do
INPUTS: incidents.dta
OUTPUTS: incidents_clean.dta
==============================================================================*/

/* Set directory if working independently through code */
if c(username)=="" { //insert username
    global rootdir "" // insert root path
    global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 

// Prevent Stata from pausing output
set more off

// Load raw incidents data
use "$processed_data/incidents.dta", clear

// Report duplicate rows
duplicates report 

* no duplicates.

// Drop unnecessary columns
drop PHYSICALADDRESS PHYSICALADDRESS2 STATE ZIPEXT PHYSICALLOCATION STATELOCATION RegulatedEntityCounty FREQ_CD DUR_TYP_CD Program RCV_WTR_BODY_TXT IncidentComments ACTV_CODES INV_STATUS_DT INV_STATUS_DT STATUS_CD STATUS_DT RESOL_DESC_TXT TYP_CD CIT_DISPLAY_TXT INCID_STATUS_DT

// Convert ZIP columns to numeric
destring ZIP ZIPLOCATION, replace

// Flag and fill missing ZIP codes using alternate column
gen ZipFlag_CIN = 1 if ZIP == . & ZIPLOCATION != .
replace ZIP = ZIPLOCATION if ZIP == . & ZIPLOCATION != .

* CURRENT ASSUMPTION: ZIPLOCATION = ALT RN LOCATION. QUESTION/DISCUSSION = CN LOCATION = owner location? PROBLEM? TRADE-OFF = MAX ZIP LOCATIONS WITHIN TEXAS vs. WANTING TO LEARN SOMETHING ABOUT DEMOS SURROUNDING INCIDENTS.

// Drop alternate ZIP column
drop ZIPLOCATION

** Texas Zipcode range - structnet.com/instructions/zip_min_max_by_state.html 

// Restrict ZIP codes to valid Texas range
replace ZIP = . if ZIP < 73301 
replace ZIP = . if ZIP > 88589

// Fill missing city values using NEARESTCITY
replace CITY = NEARESTCITY if CITY == "" & NEARESTCITY != ""

// Drop NEARESTCITY column
drop NEARESTCITY

// Rename columns for clarity
rename TCEQRegion TCEQRegion_CIN
rename ZIP ZipCode_CIN
rename CNTY County_CIN
rename CITY City_CIN
rename RegulatedEntity RN
rename CustomerNumber CN
rename PrincipalCustomer CustomerName
rename NUM_COMPLAINING_NUM Number_Of_Complaints /*Maybe?*/
rename ComplaintIncident CIN

// Clean up Number_Of_Complaints values
replace Number_Of_Complaints = . if Number_Of_Complaints == 0
replace Number_Of_Complaints = . if Number_Of_Complaints == 1600

// Convert incident date to Stata date format
gen temp = date(IncidentRecDate,"MDY")
drop IncidentRecDate 
rename temp IncidentRecDate
format IncidentRecDate %td
replace IncidentRecDate = . if IncidentRecDate < 0

// Convert investigation number to numeric
destring INV_NUM, replace

// Rename investigation number column
rename INV_NUM IN

// Create media-specific incident indicators
gen incident_air = 1 if Media == "AIR"
replace incident_air = 0 if Media == "WATER" | Media == "WASTE"
gen incident_water = 1 if Media == "WATER"
replace incident_water = 0 if Media == "AIR" | Media == "WASTE"
gen incident_waste = 1 if Media == "WASTE"
replace incident_waste = 0 if Media == "AIR" | Media == "WATER"

// Drop media column
drop Media

// Convert investigation start date to Stata date format
gen temp = date(INV_START_DT,"MDY")
drop INV_START_DT 
rename temp InvestigationStartDate
format InvestigationStartDate %td

// Rename violation track number column
rename TRACK_NUM VN 

// Drop customer columns and remove duplicates
drop CN CustomerName
duplicates drop

// Flag if investigation start date is before incident date
gen date_flag = 1 if InvestigationStartDate < IncidentRecDate 

// Create incident indicator
gen incident = 1

// Summarize media-specific incident indicators
sum incident_waste incident_water incident_air

// Extract day, month, and year from incident date
gen day = day(IncidentRecDate)
gen month = month(IncidentRecDate)
gen year = year(IncidentRecDate)

// Save cleaned incidents data
save "$processed_data/incidents_clean.dta", replace